STA 313 - Spring 2024 - HW 1

Viz dat, wrangle dis!

Author

Sana Pashankar

library(tidyverse)
library(openintro)
library(scales)
library(ggforce)
library(ggrepel)
library(gridExtra)
library(grid)
data("nyc_marathon")
# add other packages as needed or remove comment

1 - NYC marathon winners.

#a. Create a histogram and a box plot of the distribution of marathon times of all runners in the dataset. What features of the distribution are apparent in the histogram and not the box plot? What features are apparent in the box plot but not in the histogram?

nyc_marathon <- nyc_marathon %>%
  filter(!is.na(time_hrs)) 

ggplot(nyc_marathon, aes(x = time_hrs)) + 
  geom_histogram(binwidth = 0.05, fill = "skyblue4") + 
  labs(x = "Finishing time (hours)", y = "Number of Runners", 
       title = "Distribution of Finishing Times for the NYC Marathon") +
   theme_minimal() 

ggplot(nyc_marathon, aes(x = time_hrs)) + 
  geom_boxplot() + 
  labs(x = "Finishing time (hours)", y = "Proportion of runners", 
       title = "Distribution of Finishing Times for the NYC Marathon") +
   theme_minimal() 

In the histogram, you can see the number of runners who achieved a similar finishing time, whereas in the box plot you can see the proportion of runners who achieved a similar finishing time. The boxplot more clearly identifies a set range of finishing time and the average time of the runners. The histogram more clearly identifies how many runners achieved a specific time.

#b.Create a side-by-side box plots of marathon times for men and women. Use different colors for the each of the box plots – do not use the default colors, but instead manually define them (you can choose any two colors you want). Based on the plots you made, compare the distribution of marathon times for men and women.

ggplot(nyc_marathon, aes(x = time_hrs, color = division)) + 
  geom_boxplot() + 
  labs(x = "Finishing time (hours)", y = "Proportion of runners", 
       title = "Distribution of Finishing Times for the NYC Marathon", 
       color = "Division") +
   theme_minimal() + 
  scale_color_manual(values = c("skyblue", "magenta"))

#d. d. Visualize the marathon times of men and women over the years. As is usual with time series plot, year should go on the x-axis. Use different colors and shapes to represent the times for men and women. Make sure your colors match those in the previous part. Once you have your plot, describe what is visible in this plot but not in the others.

nyc_marathon_years <- nyc_marathon %>%
  group_by(year, division) %>%
  summarize(
    mean = mean(time_hrs)
  )
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
ggplot(nyc_marathon_years, aes(x = year, y = mean, color = division)) + 
  geom_line() + 
  geom_point(aes(color = division, shape = division)) + 
  scale_shape_manual(values = c("circle", "square")) + 
  scale_color_manual(values = c("skyblue", "magenta")) + 
  theme_minimal() +
  labs(x = "Year", y = "Average Finishing Time (hours)", 
       title = "Average Finishing Times for the NYC Marathon Over Time", 
       color = "Division", 
       shape = NA) + 
  guides(color = guide_legend("Division"), shape = guide_legend("Division"))

In this plot, you can see how the average finishing times of men and women has fluctuated over the years. You can see how the finishing times have decreased over time and how they have changed based on global events (COVID-19 in 2020).

2 - US counties.

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership,
                                                     y = poverty)) +
  geom_point() + 
  labs(title = "Plot A", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership,
                                                     y = poverty)) +
  geom_point() + 
  geom_smooth(se = FALSE) +
  labs(title = "Plot B", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership, y = poverty)) +
  geom_point() + 
  geom_smooth(aes(group = metro), color = "green", se = FALSE) +
  labs(title = "Plot C", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership, y = poverty)) +
  geom_smooth(aes(group = metro), color = "blue", se = FALSE) +
  geom_point() + 
  labs(title = "Plot D", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership, y = poverty, color = metro)) +
  geom_point() +
  geom_smooth(aes(linetype = metro), color = "blue", se = FALSE) +
  labs(title = "Plot E", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership, y = poverty, color = metro)) +
  geom_point() +
  geom_smooth(aes(color = metro), se = FALSE) +
  labs(title = "Plot F", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro)) +
  geom_smooth(se = FALSE) +
  labs(title = "Plot G", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

ggplot(county %>% filter(!is.na(homeownership)), aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro)) +
  labs(title = "Plot H", 
       x = "Home ownership rate", 
       y = "Percent of population in poverty")
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

3 - Credit card balances.

credit <- read_csv("data/credit.csv")
ggplot(credit, aes(x = income, y = balance)) +
  geom_point(aes(shape = student, color = student), alpha = 0.7) + 
  geom_smooth(aes(color = student), se = FALSE, method = "lm") + 
  facet_grid(rows = vars(student), cols = vars(married), labeller = label_both) +
  scale_y_continuous(labels = label_dollar(big.mark = ",")) +
  scale_x_continuous(labels = label_dollar(big.mark = ",", suffix = "K")) + 
  labs(x = "Income", 
       y = "Credit Card Balance") + 
  theme(legend.position = "none")
`geom_smooth()` using formula = 'y ~ x'

credit <- credit %>%
  mutate(utilization = (balance/limit) * 100)


ggplot(credit, aes(x = income, y = utilization)) +
  geom_point(aes(shape = student, color = student), alpha = 0.7) + 
  geom_smooth(aes(color = student), se = FALSE, method = "lm") + 
  facet_grid(rows = vars(student), cols = vars(married), labeller = label_both) +
  scale_y_continuous(labels = label_number(suffix = "%")) +
  scale_x_continuous(labels = label_dollar(big.mark = ",", suffix = "K")) + 
  labs(x = "Income", 
       y = "Credit Utilization") + 
  theme(legend.position = "none")
`geom_smooth()` using formula = 'y ~ x'

4 - Expect More. Plot More.

#l label: target-plot
# Example data
circles <- tibble(
  x0 = 0, y0 = 0,
  r   = 3,
)
circles2 <- tibble(
  x0 = 0, y0 = 0,
  r   = 2,
)
circles3 <- tibble(
  x0 = 0, y0 = 0,
  r   = 1,
  fill = "#E4002B"  # Target red hex + white
)

ggplot() +
  geom_circle(data = circles, aes(x0=x0, y0=y0, r=r),
              fill = "#E4002B", color = NA) +
  geom_circle(data = circles2, aes(x0=x0, y0=y0, r=r),
              fill = "white", color = NA) +
  geom_circle(data = circles3, aes(x0=x0, y0=y0, r=r),
              fill = "#E4002B", color = NA) +
  coord_fixed() +
  theme_void() +
  annotate("text", x = 0, y = -3.6, label = "TARGET",
           fontface = "bold", size = 12, color = "#E4002B")

5 - Napoleon’s march.

napoleon <- read_rds("data/napoleon.rds")
troops <- napoleon$troops
temp <- napoleon$temperatures 
cities <- napoleon$cities

#city_troops <- merge(x = napoleon$cities, y = napoleon$troops, by = c("long", "lat"), all = TRUE)
#napoleon_march <- merge(x = city_troops, y = napoleon$temperatures, by = "long", all = TRUE)

ggplot() +
  geom_path(data = troops, aes(x = long, y = lat, group = group, color = direction, size = survivors), lineend = "round") +
  geom_point(data = cities, aes(x = long, y = lat)) + 
  geom_text(data = cities, aes(x = long, y = lat, label = city), vjust = 1.5) +
  scale_size("Survivors", range = c(1, 10)) + 
  theme(legend.position = "none")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

plot_troops_cities <- last_plot()

temp <- temp %>%
  mutate(temp_label = if_else(!is.na(temp), paste0(temp, "° ", date), ""))
ggplot(temp, aes(x = long, y = temp)) + 
  geom_point() +
  geom_path() + 
  geom_text_repel(aes(label = temp_label), size = 3, vjust = -1)

plot_temp<- last_plot()
grid.arrange(plot_troops_cities, plot_temp)

plot_troops_cities + 
  coord_cartesian(xlim = c(24,38)) + 
  labs(x = NA, y = NA) + 
  guides(color = FALSE, size = FALSE) +
  theme_void()
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.

plot_troops_cities_fixed <- last_plot()
plot_temp + 
  coord_cartesian(xlim = c(24,38)) + 
  labs(x = NA, y = NA) + 
  theme_void() 

plot_temp_fixed <- last_plot()
grid.arrange(plot_troops_cities_fixed, plot_temp_fixed, nrow=2, heights=c(6, 1.2))
grid.rect(width = .99, height = .99, gp = gpar(lwd = 2, col = "gray", fill = NA))